In this hands-on session, we will show how principal component analysis (PCA) is used in common bioinformatics workflows. We will use a toy single-cell RNA-seq (scRNA-seq) dataset to illustrate it.
The main objectives are the following:
library(pheatmap)
library(tidyverse)
## āā Attaching packages āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā tidyverse 1.3.0 āā
## ā ggplot2 3.3.2 ā purrr 0.3.4
## ā tibble 3.0.3 ā dplyr 1.0.2
## ā tidyr 1.1.2 ā stringr 1.4.0
## ā readr 1.3.1 ā forcats 0.5.0
## Warning: package 'tibble' was built under R version 4.0.2
## Warning: package 'tidyr' was built under R version 4.0.2
## Warning: package 'dplyr' was built under R version 4.0.2
## āā Conflicts āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā tidyverse_conflicts() āā
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Create toy dataset
toy <- data.frame(
CD3D = c(4, 5, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
CD3E = c(5, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
LYZ = c(0, 0, 0, 0, 5, 7, 6, 0, 0, 0, 0, 0, 0, 0),
S100A8 = c(0, 0, 0, 0, 5, 7, 6, 0, 0, 0, 0, 0, 0, 0),
CD79A = c(0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 3, 5, 0, 0),
CD79B = c(0, 0, 0, 0, 0, 0, 0, 5, 5, 3, 4, 6, 0, 0),
BLNK = c(0, 0, 0, 0, 0, 0, 0, 6, 4, 5, 5, 4, 0, 0),
TOP2A = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 9, 0, 0),
MKI = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 10, 0, 0),
MT_CO1 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 23),
MT_ND5 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 22, 25)
)
rownames(toy) <- paste("cell", as.character(1:nrow(toy)))
toy <- as.matrix(toy)
# Visualize
pheatmap(toy, cluster_cols = FALSE, cluster_rows = FALSE, angle_col = 45)
Images obtained from this link
Three ways to perform PCA in R:
# Regardless of the method, we will center and scale the dataset
toy_scaled <- scale(toy, center = TRUE, scale = TRUE)
# Eigendecomposition of the correlation matrix
corr_matrix <- (1 / (nrow(toy_scaled) - 1)) * (t(toy_scaled) %*% toy_scaled)
eigen_corr_matrix <- eigen(corr_matrix)
eigen_corr_matrix$values
## [1] 4.585631e+00 2.611553e+00 2.386451e+00 1.224444e+00 7.936737e-02
## [6] 6.287425e-02 3.811161e-02 9.510946e-03 1.709826e-03 3.466273e-04
## [11] 2.806999e-16
pc_mat_by_eigen <- toy %*% eigen_corr_matrix$vectors
# SVD
toy_svd <- svd(toy)
toy_svd$d
## [1] 4.427641e+01 2.493893e+01 1.483240e+01 1.217057e+01 1.135782e+01
## [6] 2.408677e+00 1.732051e+00 1.264782e+00 1.023020e+00 2.807812e-01
## [11] 2.589463e-15
toy_svd$v
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.0000000 0.0000000 0.0000000 0.0000000 -0.7071068 0.0000000
## [2,] 0.0000000 0.0000000 0.0000000 0.0000000 -0.7071068 0.0000000
## [3,] 0.0000000 0.0000000 -0.7071068 0.0000000 0.0000000 0.0000000
## [4,] 0.0000000 0.0000000 -0.7071068 0.0000000 0.0000000 0.0000000
## [5,] 0.0000000 -0.3248565 0.0000000 -0.4912851 0.0000000 -0.1731681
## [6,] 0.0000000 -0.3690519 0.0000000 -0.4017359 0.0000000 -0.5908207
## [7,] 0.0000000 -0.3611502 0.0000000 -0.4785710 0.0000000 0.7111869
## [8,] 0.0000000 -0.5612625 0.0000000 0.4392304 0.0000000 0.2546330
## [9,] 0.0000000 -0.5593066 0.0000000 0.4186809 0.0000000 -0.2243186
## [10,] -0.6592829 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## [11,] -0.7518950 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## [,7] [,8] [,9] [,10] [,11]
## [1,] -0.7071068 0.0000000 0.0000000 0.0000000 0.0000000
## [2,] 0.7071068 0.0000000 0.0000000 0.0000000 0.0000000
## [3,] 0.0000000 0.0000000 0.0000000 0.0000000 0.7071068
## [4,] 0.0000000 0.0000000 0.0000000 0.0000000 -0.7071068
## [5,] 0.0000000 0.0000000 -0.6350973 -0.4687979 0.0000000
## [6,] 0.0000000 0.0000000 0.5856886 0.1015318 0.0000000
## [7,] 0.0000000 0.0000000 0.1003795 0.3530970 0.0000000
## [8,] 0.0000000 0.0000000 0.3047222 -0.5782452 0.0000000
## [9,] 0.0000000 0.0000000 -0.3881864 0.5575615 0.0000000
## [10,] 0.0000000 0.7518950 0.0000000 0.0000000 0.0000000
## [11,] 0.0000000 -0.6592829 0.0000000 0.0000000 0.0000000
toy_svd$u
## [,1] [,2] [,3] [,4] [,5]
## [1,] 5.243443e-17 -1.227325e-17 0.0000000 0.0000000 -0.5603155
## [2,] -4.293690e-18 4.137727e-18 0.0000000 0.0000000 -0.4980582
## [3,] -1.300960e-18 -8.310435e-18 0.0000000 0.0000000 -0.4980582
## [4,] -6.102181e-17 2.054870e-17 0.0000000 0.0000000 -0.4358010
## [5,] -1.274609e-16 2.075731e-16 -0.4767313 0.0000000 0.0000000
## [6,] -4.357354e-17 -1.325069e-16 -0.6674238 0.0000000 0.0000000
## [7,] 1.570532e-16 -1.838621e-17 -0.5720776 0.0000000 0.0000000
## [8,] 1.072358e-30 -2.390359e-01 0.0000000 -0.6431757 0.0000000
## [9,] -5.669938e-31 -1.840210e-01 0.0000000 -0.4837984 0.0000000
## [10,] -8.135128e-31 -1.689059e-01 0.0000000 -0.4571028 0.0000000
## [11,] 5.546678e-32 -6.874414e-01 0.0000000 0.3617415 0.0000000
## [12,] -7.395571e-32 -6.386652e-01 0.0000000 0.1116429 0.0000000
## [13,] -6.416234e-01 0.000000e+00 0.0000000 0.0000000 0.0000000
## [14,] -7.670198e-01 0.000000e+00 0.0000000 0.0000000 0.0000000
## [,6] [,7] [,8] [,9] [,10]
## [1,] 3.922781e-17 4.082483e-01 8.995782e-16 9.042871e-16 2.442663e-15
## [2,] -1.018141e-17 -8.164966e-01 -7.366362e-17 -1.742673e-16 -4.195345e-16
## [3,] 1.819192e-17 -2.081668e-16 -2.231959e-17 2.531632e-16 5.430541e-16
## [4,] -5.959062e-17 4.082483e-01 -1.046905e-15 -1.252822e-15 -3.281733e-15
## [5,] 1.014303e-16 0.000000e+00 -2.186751e-15 -1.965896e-15 -5.745775e-15
## [6,] -1.122468e-17 0.000000e+00 -7.475586e-16 8.546639e-16 1.646529e-15
## [7,] -7.142979e-17 0.000000e+00 2.694444e-15 6.411390e-16 2.867195e-15
## [8,] 1.137592e-01 0.000000e+00 1.814380e-29 -2.735664e-01 -6.643848e-01
## [9,] -3.329747e-01 0.000000e+00 -1.005798e-29 7.718049e-01 1.597511e-01
## [10,] 4.528627e-01 0.000000e+00 -1.400228e-29 -2.750934e-01 6.940942e-01
## [11,] 5.236231e-01 0.000000e+00 1.084684e-30 3.186414e-01 -1.444085e-01
## [12,] -6.300166e-01 0.000000e+00 -1.774937e-30 -3.902178e-01 1.745046e-01
## [13,] 0.000000e+00 0.000000e+00 -7.670198e-01 0.000000e+00 0.000000e+00
## [14,] 0.000000e+00 0.000000e+00 6.416234e-01 0.000000e+00 0.000000e+00
## [,11]
## [1,] 0.44082023
## [2,] 0.26724697
## [3,] -0.84513423
## [4,] 0.09367371
## [5,] 0.01102435
## [6,] 0.06394121
## [7,] -0.08378503
## [8,] 0.00000000
## [9,] 0.00000000
## [10,] 0.00000000
## [11,] 0.00000000
## [12,] 0.00000000
## [13,] 0.00000000
## [14,] 0.00000000
# PCA
pca_out <- prcomp(toy, scale = TRUE)
# The singular values, PC scores and gene loadings are in the sdev, x and
# rotation, respectively
pca_out$sdev
## [1] 2.141409e+00 1.616030e+00 1.544814e+00 1.106546e+00 2.817222e-01
## [6] 2.507474e-01 1.952219e-01 9.752408e-02 4.135004e-02 1.861793e-02
## [11] 1.382766e-16
pca_out$x
## PC1 PC2 PC3 PC4 PC5
## cell 1 -1.700709 -2.277481608 0.7241901181 0.2151054 0.102593341
## cell 2 -1.584322 -2.016467915 0.6342759544 0.1682310 0.015219243
## cell 3 -1.584322 -2.016467915 0.6342759544 0.1682310 0.015219243
## cell 4 -1.467934 -1.755454222 0.5443617907 0.1213565 -0.072154855
## cell 5 -1.312695 1.959136235 1.1183531824 0.1207163 -0.106225187
## cell 6 -1.576484 2.714134076 1.5997093974 0.2517088 0.124794156
## cell 7 -1.444590 2.336635156 1.3590312899 0.1862125 0.009284485
## cell 8 2.510082 -0.033447419 0.0260575956 -2.0671324 0.047180495
## cell 9 1.771588 -0.009409052 0.0008410154 -1.5643475 0.247975589
## cell 10 1.572523 -0.002108551 -0.0071397911 -1.5316980 -0.521378439
## cell 11 3.617468 -0.102692034 0.1124176895 2.2198664 -0.514172706
## cell 12 3.841249 -0.104927920 0.1126449068 1.2602777 0.615767683
## cell 13 -1.258998 0.600219564 -3.1191919099 0.1855753 -0.048745593
## cell 14 -1.382856 0.708331605 -3.7398271936 0.2658970 0.084642545
## PC6 PC7 PC8 PC9 PC10
## cell 1 3.690902e-01 -0.188473794 0.060388488 0.0160626561 -0.015949470
## cell 2 -7.381803e-01 -0.009172048 -0.002134342 -0.0009781609 0.001044560
## cell 3 -6.453303e-16 -0.009172048 -0.002134342 -0.0009781609 0.001044560
## cell 4 3.690902e-01 0.170129697 -0.064657171 -0.0180189779 0.018038591
## cell 5 2.540770e-17 0.241320835 -0.089781300 -0.0248886041 0.024893168
## cell 6 -6.832934e-16 -0.232247598 0.075232971 0.0200778331 -0.019948286
## cell 7 -3.289428e-16 0.004536618 -0.007274165 -0.0024053855 0.002472441
## cell 8 3.388821e-16 -0.357036308 -0.022120629 0.0244399174 0.034497753
## cell 9 -7.149511e-16 0.386089428 0.209331975 -0.0009399500 -0.005722765
## cell 10 2.171734e-15 0.018993895 -0.122504094 -0.0286935148 -0.037800303
## cell 11 2.449428e-15 -0.034305911 0.119468501 0.0089819482 0.009747667
## cell 12 -1.976915e-15 0.030747615 -0.147140221 -0.0108673024 -0.011766229
## cell 13 5.461687e-16 0.129595638 -0.061521039 0.1061315311 -0.003028819
## cell 14 3.097140e-16 -0.151006018 0.054845368 -0.0879238294 0.002477131
## PC11
## cell 1 9.972792e-17
## cell 2 -2.333390e-16
## cell 3 -6.680553e-17
## cell 4 -6.680553e-17
## cell 5 1.552391e-16
## cell 6 3.772837e-16
## cell 7 3.772837e-16
## cell 8 2.589384e-17
## cell 9 -2.302651e-16
## cell 10 -3.429329e-16
## cell 11 -4.703935e-16
## cell 12 -2.336826e-16
## cell 13 1.461166e-16
## cell 14 4.071682e-16
pca_out$rotation
## PC1 PC2 PC3 PC4 PC5 PC6
## CD3D -0.2229758 -0.50005276 0.17225849 0.08980263 0.1673922 -7.071068e-01
## CD3E -0.2229758 -0.50005276 0.17225849 0.08980263 0.1673922 7.071068e-01
## LYZ -0.1704618 0.48788427 0.31105536 0.08464816 0.1492861 -3.330669e-16
## S100A8 -0.1704618 0.48788427 0.31105536 0.08464816 0.1492861 -5.828671e-16
## CD79A 0.4308542 -0.01396106 0.01460081 -0.32621575 0.3088705 -1.165734e-15
## CD79B 0.4498668 -0.01565046 0.01682340 -0.16707620 0.6328444 -2.192690e-15
## BLNK 0.4378239 -0.01431917 0.01506855 -0.26297814 -0.5738734 2.525757e-15
## TOP2A 0.3390601 -0.01661800 0.01974082 0.61846364 -0.1578042 9.159340e-16
## MKI 0.3436031 -0.01680972 0.01995000 0.60916732 0.1273774 -7.494005e-16
## MT_CO1 -0.1216850 0.10629113 -0.61027681 0.07911621 0.1380316 -1.942890e-16
## MT_ND5 -0.1221137 0.10644691 -0.61089638 0.07880491 0.1182514 -3.053113e-16
## PC7 PC8 PC9 PC10 PC11
## CD3D -0.34350816 0.1197819 0.03264698 -0.03255734 0.000000e+00
## CD3E -0.34350816 0.1197819 0.03264698 -0.03255734 3.071566e-16
## LYZ -0.30602284 0.1066332 0.02905759 -0.02897682 -7.071068e-01
## S100A8 -0.30602284 0.1066332 0.02905759 -0.02897682 7.071068e-01
## CD79A -0.41747376 -0.5778181 0.01412364 0.32215102 6.076466e-16
## CD79B 0.21220710 0.5574800 0.04063081 -0.10591897 -3.092444e-17
## BLNK -0.45774028 0.3355090 0.01577650 -0.29503993 -3.373183e-16
## TOP2A -0.07059582 0.2292571 0.11634947 0.63709746 7.997109e-16
## MKI -0.03694660 -0.3331095 -0.11632125 -0.60670324 -8.104585e-16
## MT_CO1 -0.30655181 0.1659499 -0.67034752 0.07256633 4.387840e-16
## MT_ND5 -0.21803472 0.0167084 0.71947675 -0.12208269 -8.877856e-17
Proportion of variance explained (PVE): https://faculty.marshall.usc.edu/gareth-james/ISL/ISLR%20Seventh%20Printing.pdf
summary(pca_out)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.1414 1.6160 1.5448 1.1065 0.28172 0.25075 0.19522
## Proportion of Variance 0.4169 0.2374 0.2170 0.1113 0.00722 0.00572 0.00346
## Cumulative Proportion 0.4169 0.6543 0.8712 0.9826 0.98977 0.99548 0.99895
## PC8 PC9 PC10 PC11
## Standard deviation 0.09752 0.04135 0.01862 1.383e-16
## Proportion of Variance 0.00086 0.00016 0.00003 0.000e+00
## Cumulative Proportion 0.99981 0.99997 1.00000 1.000e+00
# Calculate percentage of variance explained (PVE):
pve <- pca_out$sdev ** 2 / sum(pca_out$sdev ** 2) * 100
pve <- round(pve, 2)
pve_df <- data.frame(principal_component = 1:length(pve), pve = pve)
pve_gg <- pve_df %>%
ggplot(aes(principal_component, pve)) +
geom_point() +
geom_vline(xintercept = 4.5, linetype = "dashed", color = "darkblue") +
scale_x_continuous(breaks = 1:length(pve)) +
labs(x = "Principal Component", y = "Percentage of Variance Explained (%)") +
theme_bw()
pve_gg
Thus, we conclude that the first 4 PC are significant. Let us express each cell as a vector of the first 4 PC:
toy_reduced <- pca_out$x[, c("PC1", "PC2", "PC3", "PC4")]
gene_loads_reduced <- pca_out$rotation[, c("PC1", "PC2", "PC3", "PC4")]
pheatmap(toy_reduced, cluster_rows = FALSE, cluster_cols = FALSE, angle_col = 45)
significant_pcs <- c("PC1", "PC2", "PC3", "PC4")
loadings_gg <- map(significant_pcs, function(x) {
loadings <- gene_loads_reduced[, x]
df <- data.frame(gene = names(loadings), score = loadings)
p <- df %>%
ggplot(aes(fct_reorder(gene, score), score)) +
geom_point() +
labs(x = "", y = x) +
theme_bw() +
coord_flip()
return(p)
})
Interpretation of the PC:
PC1: B cell identity. PC2: T vs monocyte separation. PC3: technical variation. PC4: cell cycle.
dist_mat <- dist(toy_reduced, method = "euclidean")
pheatmap(dist_mat, cluster_rows = FALSE, cluster_cols = FALSE)
hclust_average <- hclust(dist_mat, method = "average")
plot(
hclust_average,
labels = rownames(toy),
main = "Average Linkage",
xlab = "",
sub = "",
ylab = ""
)
plot(
hclust_average,
labels = rownames(toy),
main = "Average Linkage",
xlab = "",
sub = "",
ylab = ""
)
abline(h = 2.5, col = "red")
hc_clusters <- cutree(hclust_average, h = 2.5)
hc_clusters
## cell 1 cell 2 cell 3 cell 4 cell 5 cell 6 cell 7 cell 8 cell 9 cell 10
## 1 1 1 1 2 2 2 3 3 3
## cell 11 cell 12 cell 13 cell 14
## 4 4 5 5
table(hc_clusters)
## hc_clusters
## 1 2 3 4 5
## 4 3 3 2 2
annotation_rows <- data.frame(cluster = as.character(hc_clusters))
rownames(annotation_rows) <- names(hc_clusters)
pheatmap(
toy_scaled,
cluster_rows = FALSE,
cluster_cols = FALSE,
angle_col = 315,
annotation_row = annotation_rows
)
| Cluster | Cell type |
|---|---|
| 1 | T-cells |
| 2 | Monocytes |
| 3 | Naive B-cells |
| 4 | Plasma Cells |
| 5 | poor-quality cells |
annotated_clusters <- c("T-cells", "Monocytes", "Naive B-cells", "Plasma Cells", "poor-quality cells")
names(annotated_clusters) <- as.character(1:5)
annotation_rows$annotation <- annotated_clusters[annotation_rows$cluster]
pheatmap(
toy_scaled,
cluster_rows = FALSE,
cluster_cols = FALSE,
angle_col = 315,
annotation_row = annotation_rows
)
hierarchical clustering: concept of distance, pairwise distance.
sessionInfo()
## R version 4.0.1 (2020-06-06)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.2 purrr_0.3.4
## [5] readr_1.3.1 tidyr_1.1.2 tibble_3.0.3 ggplot2_3.3.2
## [9] tidyverse_1.3.0 pheatmap_1.0.12 BiocStyle_2.16.1
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.0 xfun_0.18 haven_2.3.1
## [4] colorspace_1.4-1 vctrs_0.3.4 generics_0.0.2
## [7] htmltools_0.5.0 yaml_2.2.1 blob_1.2.1
## [10] rlang_0.4.7 pillar_1.4.6 withr_2.3.0
## [13] glue_1.4.2 DBI_1.1.0 RColorBrewer_1.1-2
## [16] dbplyr_1.4.4 modelr_0.1.8 readxl_1.3.1
## [19] lifecycle_0.2.0 munsell_0.5.0 gtable_0.3.0
## [22] cellranger_1.1.0 rvest_0.3.6 evaluate_0.14
## [25] labeling_0.3 knitr_1.30 fansi_0.4.1
## [28] broom_0.7.1 Rcpp_1.0.5 scales_1.1.1
## [31] backports_1.1.10 BiocManager_1.30.10 magick_2.4.0
## [34] jsonlite_1.7.1 farver_2.0.3 fs_1.5.0
## [37] hms_0.5.3 digest_0.6.25 stringi_1.5.3
## [40] bookdown_0.20 grid_4.0.1 cli_2.0.2
## [43] tools_4.0.1 magrittr_1.5 crayon_1.3.4
## [46] pkgconfig_2.0.3 ellipsis_0.3.1 xml2_1.3.2
## [49] reprex_0.3.0 lubridate_1.7.9 rstudioapi_0.11
## [52] assertthat_0.2.1 rmarkdown_2.4 httr_1.4.2
## [55] R6_2.4.1 compiler_4.0.1